library(tidyverse)
library(GGally)   # ggplot extensions
library(caret)
library(magrittr)
library(skimr)
library(janitor)  # for data cleaning purposes
library(glmnet)   # the main package for penalized linear models
library(broom)    # for tidying regression coefficient outputs
library(knitr)
library(kableExtra)  # for nicer tables in rmarkdown

theme_set(theme_bw())   # globally set ggplot theme

Disclaimer: this lab borrows lots of ideas and examples from this great tutorial.

Data: Ames Housing dataset

We are going to predict house sales prices in Ames, Iowa. See here.

library(AmesHousing)
housing_data <- make_ames()
housing_data <- clean_names(housing_data) # from janitor: standardizes separators and letter case in all column names
housing_data <- housing_data %>% mutate(log_sale_price = log(sale_price)) # log outcome: price is heavily right-skewed
skim(housing_data)
Data summary
Name housing_data
Number of rows 2930
Number of columns 82
_______________________
Column type frequency:
factor 46
numeric 36
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
ms_sub_class 0 1 FALSE 16 One: 1079, Two: 575, One: 287, One: 192
ms_zoning 0 1 FALSE 7 Res: 2273, Res: 462, Flo: 139, Res: 27
street 0 1 FALSE 2 Pav: 2918, Grv: 12
alley 0 1 FALSE 3 No_: 2732, Gra: 120, Pav: 78
lot_shape 0 1 FALSE 4 Reg: 1859, Sli: 979, Mod: 76, Irr: 16
land_contour 0 1 FALSE 4 Lvl: 2633, HLS: 120, Bnk: 117, Low: 60
utilities 0 1 FALSE 3 All: 2927, NoS: 2, NoS: 1
lot_config 0 1 FALSE 5 Ins: 2140, Cor: 511, Cul: 180, FR2: 85
land_slope 0 1 FALSE 3 Gtl: 2789, Mod: 125, Sev: 16
neighborhood 0 1 FALSE 28 Nor: 443, Col: 267, Old: 239, Edw: 194
condition_1 0 1 FALSE 9 Nor: 2522, Fee: 164, Art: 92, RRA: 50
condition_2 0 1 FALSE 8 Nor: 2900, Fee: 13, Art: 5, Pos: 4
bldg_type 0 1 FALSE 5 One: 2425, Twn: 233, Dup: 109, Twn: 101
house_style 0 1 FALSE 8 One: 1481, Two: 873, One: 314, SLv: 128
overall_qual 0 1 FALSE 10 Ave: 825, Abo: 732, Goo: 602, Ver: 350
overall_cond 0 1 FALSE 9 Ave: 1654, Abo: 533, Goo: 390, Ver: 144
roof_style 0 1 FALSE 6 Gab: 2321, Hip: 551, Gam: 22, Fla: 20
roof_matl 0 1 FALSE 8 Com: 2887, Tar: 23, WdS: 9, WdS: 7
exterior_1st 0 1 FALSE 16 Vin: 1026, Met: 450, HdB: 442, Wd : 420
exterior_2nd 0 1 FALSE 17 Vin: 1015, Met: 447, HdB: 406, Wd : 397
mas_vnr_type 0 1 FALSE 5 Non: 1775, Brk: 880, Sto: 249, Brk: 25
exter_qual 0 1 FALSE 4 Typ: 1799, Goo: 989, Exc: 107, Fai: 35
exter_cond 0 1 FALSE 5 Typ: 2549, Goo: 299, Fai: 67, Exc: 12
foundation 0 1 FALSE 6 PCo: 1310, CBl: 1244, Brk: 311, Sla: 49
bsmt_qual 0 1 FALSE 6 Typ: 1283, Goo: 1219, Exc: 258, Fai: 88
bsmt_cond 0 1 FALSE 6 Typ: 2616, Goo: 122, Fai: 104, No_: 80
bsmt_exposure 0 1 FALSE 5 No: 1906, Av: 418, Gd: 284, Mn: 239
bsmt_fin_type_1 0 1 FALSE 7 GLQ: 859, Unf: 851, ALQ: 429, Rec: 288
bsmt_fin_type_2 0 1 FALSE 7 Unf: 2499, Rec: 106, LwQ: 89, No_: 81
heating 0 1 FALSE 6 Gas: 2885, Gas: 27, Gra: 9, Wal: 6
heating_qc 0 1 FALSE 5 Exc: 1495, Typ: 864, Goo: 476, Fai: 92
central_air 0 1 FALSE 2 Y: 2734, N: 196
electrical 0 1 FALSE 6 SBr: 2682, Fus: 188, Fus: 50, Fus: 8
kitchen_qual 0 1 FALSE 5 Typ: 1494, Goo: 1160, Exc: 205, Fai: 70
functional 0 1 FALSE 8 Typ: 2728, Min: 70, Min: 65, Mod: 35
fireplace_qu 0 1 FALSE 6 No_: 1422, Goo: 744, Typ: 600, Fai: 75
garage_type 0 1 FALSE 7 Att: 1731, Det: 782, Bui: 186, No_: 157
garage_finish 0 1 FALSE 4 Unf: 1231, RFn: 812, Fin: 728, No_: 159
garage_qual 0 1 FALSE 6 Typ: 2615, No_: 159, Fai: 124, Goo: 24
garage_cond 0 1 FALSE 6 Typ: 2665, No_: 159, Fai: 74, Goo: 15
paved_drive 0 1 FALSE 3 Pav: 2652, Dir: 216, Par: 62
pool_qc 0 1 FALSE 5 No_: 2917, Exc: 4, Goo: 4, Typ: 3
fence 0 1 FALSE 5 No_: 2358, Min: 330, Goo: 118, Goo: 112
misc_feature 0 1 FALSE 6 Non: 2824, She: 95, Gar: 5, Oth: 4
sale_type 0 1 FALSE 10 WD : 2536, New: 239, COD: 87, Con: 26
sale_condition 0 1 FALSE 6 Nor: 2413, Par: 245, Abn: 190, Fam: 46

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
lot_frontage 0 1 57.65 33.50 0.00 43.00 63.00 78.00 313.00 ▇▇▁▁▁
lot_area 0 1 10147.92 7880.02 1300.00 7440.25 9436.50 11555.25 215245.00 ▇▁▁▁▁
year_built 0 1 1971.36 30.25 1872.00 1954.00 1973.00 2001.00 2010.00 ▁▂▃▆▇
year_remod_add 0 1 1984.27 20.86 1950.00 1965.00 1993.00 2004.00 2010.00 ▅▂▂▃▇
mas_vnr_area 0 1 101.10 178.63 0.00 0.00 0.00 162.75 1600.00 ▇▁▁▁▁
bsmt_fin_sf_1 0 1 4.18 2.23 0.00 3.00 3.00 7.00 7.00 ▃▂▇▁▇
bsmt_fin_sf_2 0 1 49.71 169.14 0.00 0.00 0.00 0.00 1526.00 ▇▁▁▁▁
bsmt_unf_sf 0 1 559.07 439.54 0.00 219.00 465.50 801.75 2336.00 ▇▅▂▁▁
total_bsmt_sf 0 1 1051.26 440.97 0.00 793.00 990.00 1301.50 6110.00 ▇▃▁▁▁
first_flr_sf 0 1 1159.56 391.89 334.00 876.25 1084.00 1384.00 5095.00 ▇▃▁▁▁
second_flr_sf 0 1 335.46 428.40 0.00 0.00 0.00 703.75 2065.00 ▇▃▂▁▁
low_qual_fin_sf 0 1 4.68 46.31 0.00 0.00 0.00 0.00 1064.00 ▇▁▁▁▁
gr_liv_area 0 1 1499.69 505.51 334.00 1126.00 1442.00 1742.75 5642.00 ▇▇▁▁▁
bsmt_full_bath 0 1 0.43 0.52 0.00 0.00 0.00 1.00 3.00 ▇▆▁▁▁
bsmt_half_bath 0 1 0.06 0.25 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
full_bath 0 1 1.57 0.55 0.00 1.00 2.00 2.00 4.00 ▁▇▇▁▁
half_bath 0 1 0.38 0.50 0.00 0.00 0.00 1.00 2.00 ▇▁▅▁▁
bedroom_abv_gr 0 1 2.85 0.83 0.00 2.00 3.00 3.00 8.00 ▁▇▂▁▁
kitchen_abv_gr 0 1 1.04 0.21 0.00 1.00 1.00 1.00 3.00 ▁▇▁▁▁
tot_rms_abv_grd 0 1 6.44 1.57 2.00 5.00 6.00 7.00 15.00 ▁▇▂▁▁
fireplaces 0 1 0.60 0.65 0.00 0.00 1.00 1.00 4.00 ▇▇▁▁▁
garage_cars 0 1 1.77 0.76 0.00 1.00 2.00 2.00 5.00 ▅▇▂▁▁
garage_area 0 1 472.66 215.19 0.00 320.00 480.00 576.00 1488.00 ▃▇▃▁▁
wood_deck_sf 0 1 93.75 126.36 0.00 0.00 0.00 168.00 1424.00 ▇▁▁▁▁
open_porch_sf 0 1 47.53 67.48 0.00 0.00 27.00 70.00 742.00 ▇▁▁▁▁
enclosed_porch 0 1 23.01 64.14 0.00 0.00 0.00 0.00 1012.00 ▇▁▁▁▁
three_season_porch 0 1 2.59 25.14 0.00 0.00 0.00 0.00 508.00 ▇▁▁▁▁
screen_porch 0 1 16.00 56.09 0.00 0.00 0.00 0.00 576.00 ▇▁▁▁▁
pool_area 0 1 2.24 35.60 0.00 0.00 0.00 0.00 800.00 ▇▁▁▁▁
misc_val 0 1 50.64 566.34 0.00 0.00 0.00 0.00 17000.00 ▇▁▁▁▁
mo_sold 0 1 6.22 2.71 1.00 4.00 6.00 8.00 12.00 ▅▆▇▃▃
year_sold 0 1 2007.79 1.32 2006.00 2007.00 2008.00 2009.00 2010.00 ▇▇▇▇▃
sale_price 0 1 180796.06 79886.69 12789.00 129500.00 160000.00 213500.00 755000.00 ▇▇▁▁▁
longitude 0 1 -93.64 0.03 -93.69 -93.66 -93.64 -93.62 -93.58 ▅▅▇▆▁
latitude 0 1 42.03 0.02 41.99 42.02 42.03 42.05 42.06 ▂▂▇▇▇
log_sale_price 0 1 12.02 0.41 9.46 11.77 11.98 12.27 13.53 ▁▁▆▇▁

Look at the outcome variable, the price of houses.

# Raw sale price: strongly right-skewed
housing_data %>% ggplot(aes(x = sale_price)) + geom_density()

# Log sale price: much closer to symmetric, better suited for linear models
housing_data %>% ggplot(aes(x = log_sale_price)) + geom_density()

Let’s see correlations of features and the outcome variable. Based on this, simple benchmark models can quickly be put together.

ggcorr(housing_data) # correlation plot from the GGally package; a quick way to scan all pairwise correlations at once

# The last row corresponds to our outcome variable; the darkest cells mark the features most strongly associated with it
ggpairs(housing_data, columns = c("log_sale_price", "gr_liv_area", "tot_rms_abv_grd", "garage_area"))

# The diagonal densities are not on a common scale; they only show the overall shape of each distribution

A baseline model

# Baseline model: regress log price on living area and room count together
housing_data %>%
  lm(formula = log_sale_price ~ gr_liv_area + tot_rms_abv_grd, data = .) %>%
  tidy() %>%
  kable(digits = 3) %>%
  kable_styling(full_width = FALSE)
term estimate std.error statistic p.value
(Intercept) 11.318 0.023 498.266 0
gr_liv_area 0.001 0.000 38.602 0
tot_rms_abv_grd -0.052 0.006 -9.006 0

Weird pattern: sale price decreases with the number of rooms? This is spurious and is caused by the high positive correlation between the feature variables. Univariate regressions have the intuitive signs. Although linear regression is unbiased, it can still have high variance — for example, when the features are highly collinear. Do not insist on unbiasedness (or low bias) at any cost: the price may be very high variance. LASSO is a penalized model that, in the presence of multicollinearity, tends to keep only one of the correlated variables.

# Univariate benchmark: living area alone
housing_data %>%
  lm(formula = log_sale_price ~ gr_liv_area, data = .) %>%
  tidy() %>%
  kable(digits = 3) %>%
  kable_styling(full_width = FALSE)
term estimate std.error statistic p.value
(Intercept) 11.180 0.017 660.123 0
gr_liv_area 0.001 0.000 52.430 0
# Univariate benchmark: room count alone — note the sign is positive here,
# unlike in the two-feature model above
housing_data %>%
  lm(formula = log_sale_price ~ tot_rms_abv_grd, data = .) %>%
  tidy() %>%
  kable(digits = 3) %>%
  kable_styling(full_width = FALSE)
term estimate std.error statistic p.value
(Intercept) 11.199 0.028 405.172 0
tot_rms_abv_grd 0.128 0.004 30.627 0

Penalized methods offer a solution to these kinds of patterns.

Set up training and test (holdout) datasets

# Split the data 70/30 into training and holdout sets.
# createDataPartition only draws row indices; nothing is trained yet.
set.seed(1234)
training_ratio <- 0.7
train_indices <- as.vector(
  createDataPartition(
    y = housing_data[["log_sale_price"]],
    times = 1,
    p = training_ratio,
    list = FALSE
  )
)
data_train <- housing_data[train_indices, ]
data_test <- housing_data[-train_indices, ]

# 10-fold cross-validation settings, reused by all caret models below
fit_control <- trainControl(method = "cv", number = 10)

Penalize large coefficients: the Ridge regression

The ridge regression adds a penalty term to the sum of squared residuals: the sum of squares of the regression coefficients. This puts a cost on having large coefficients. Result: a biased but lower-variance model. Ridge keeps all of the features — none are dropped: it shrinks coefficients but it won’t set coefficients exactly to zero.

features <- setdiff(names(housing_data), c("sale_price", "log_sale_price"))
# use every column except the outcome (and its raw, un-logged version) as a feature

First we are going to directly work with the glmnet package to estimate penalized models. Then we look at how this can be implemented through caret.

# glmnet needs inputs as a matrix. model.matrix: handles factor variables
# by expanding each factor into dummy columns (a 3-level factor becomes 3 dummies)
# -1: we do not need the intercept as glmnet will automatically include it
# ~ . chooses all variables
# NOTE: `with = FALSE` is data.table syntax; data_train is a tibble here,
# so plain column selection by name is the correct form.
x_train <- model.matrix( ~ . -1, data_train[, features])
dim(x_train)
## [1] 2053  309
# standardization of variables is automatically done by glmnet

# how much penalty do we want to apply? select with CV
# (a first, wide trial grid; note glmnet below is called without `lambda = `,
# so it builds its own sequence — pass `lambda = lambda_grid` to use this one)
lambda_grid <- 10^seq(2, -5, length = 100)

set.seed(1234)
ridge_model <- glmnet(
  x = x_train, y = data_train[["log_sale_price"]], 
  family = "gaussian", # for continuous response: we are having regression here
  alpha = 0  # the ridge model
)

plot(ridge_model, xvar = "lambda") 

# each line is the coefficient of one (dummy-expanded) variable; more penalty means more shrinkage, coefficients bunch together towards zero
# coefficients have been standardized so they can be compared: with more variables the coefficients have more influence on the outcome variable
# a coefficient that approaches zero more slowly suggests a more influential variable

Look at some individual coefficients.

# helper function to extract the coefficient sequence as a data.table
# 直接垂直截断
# Helper: extract the full coefficient path of a glmnet fit as a tibble with
# columns variable / lambda_id / value / lambda. Replaces the deprecated
# tidy() method for sparse matrices; like tidy(), only non-zero entries are
# kept (relevant for LASSO paths, where many coefficients are exactly zero).
get_glmnet_coeff_sequence <- function(glmnet_model) {
  coeff_sequence <- coef(glmnet_model) %>%
    as.matrix() %>%
    as_tibble(rownames = "variable") %>%
    pivot_longer(
      cols = -variable,
      names_to = "lambda_id",
      values_to = "value"
    ) %>%
    filter(value != 0)

  # coef() labels the lambda sequence s0, s1, ...; map those ids back to the
  # actual lambda values stored on the model object
  lambdas <- tibble(
    lambda = glmnet_model$lambda,
    lambda_id = paste0("s", 0:(length(glmnet_model$lambda) - 1))
  )

  dplyr::inner_join(coeff_sequence, lambdas, by = "lambda_id")
}
# extract the ridge coefficient path for plotting below
ridge_coeffs <- get_glmnet_coeff_sequence(ridge_model)
## Warning: 'tidy.dgCMatrix' is deprecated.
## See help("Deprecated")
## Warning: 'tidy.dgTMatrix' is deprecated.
## See help("Deprecated")
selected_variables <- c("gr_liv_area", "tot_rms_abv_grd", "garage_area",  "kitchen_abv_gr")
ggplot(
  data = ridge_coeffs %>% filter(variable %in% selected_variables),
  aes(x = log(lambda), y = value)) +
    geom_line() +
  facet_wrap(~ variable, scales = "free_y", ncol = 1)

# Because of collinearity among features, coefficients shrink towards zero overall,
# but an individual coefficient need not decrease monotonically: as the penalty
# grows, one variable may be penalized while a collinear partner is not.
# penalty is for some of the coefficients, not all of them.

We can use cross-validation to determine the optimal penalty term weight. Two lambda values marked on the plot: one with the minimal CV RMSE, the other is the simplest model (highest lambda) which contains the optimal lambda’s error within one standard deviation. That is, it gives the simplest model that is still “good enough”.

set.seed(1234)
ridge_model_cv <- cv.glmnet( # find the best lambda
  x = x_train, y = data_train[["log_sale_price"]], 
  family = "gaussian",
  alpha = 0,
  nfolds = 10 # 10-fold cross-validation; the lambda sequence itself is chosen by glmnet
)

best_lambda <- ridge_model_cv$lambda.min
message(paste0("The optimally chosen penalty parameter: ", best_lambda))
## The optimally chosen penalty parameter: 0.180129135344916
# lambda.1se: the most parsimonious model whose CV error is within one SE of the minimum
highest_good_enough_lambda <- ridge_model_cv$lambda.1se
message(paste0("The highest good enough penalty parameter: ", highest_good_enough_lambda))
## The highest good enough penalty parameter: 0.875896216853839
plot(ridge_model_cv)

We can also use caret to estimate ridge models. This lets us compare it later to any other model estimated with caret, using, for example, cross-validation with exactly the same folds.

# ridge model
# caret tuning grid: alpha = 0 fixes ridge; lambda values are the candidate penalties
ridge_tune_grid <- expand.grid(
  "alpha" = c(0), # ridge model
  "lambda" = seq(0.05, 0.5, by = 0.025)
)

set.seed(857)
ridge_fit <- train(
  log_sale_price ~ . -sale_price,
  data = data_train,
  method = "glmnet", # edit tuneGrid above to explore a different set of lambda values
  preProcess = c("center", "scale"),
  tuneGrid = ridge_tune_grid,
  trControl = fit_control
)
ridge_fit
## glmnet 
## 
## 2053 samples
##   81 predictor
## 
## Pre-processing: centered (308), scaled (308) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1847, 1849, 1849, 1847, 1848, 1847, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE       Rsquared   MAE       
##   0.050   0.1464742  0.8707003  0.08654948
##   0.075   0.1446715  0.8732402  0.08641530
##   0.100   0.1436810  0.8746908  0.08668066
##   0.125   0.1431577  0.8755367  0.08716370
##   0.150   0.1429352  0.8760033  0.08772921
##   0.175   0.1429234  0.8762089  0.08830684
##   0.200   0.1430586  0.8762391  0.08891297
##   0.225   0.1433025  0.8761429  0.08951049
##   0.250   0.1436295  0.8759524  0.09012103
##   0.275   0.1440211  0.8756904  0.09074102
##   0.300   0.1444622  0.8753753  0.09135751
##   0.325   0.1449408  0.8750214  0.09196578
##   0.350   0.1454497  0.8746372  0.09257827
##   0.375   0.1459867  0.8742271  0.09318557
##   0.400   0.1465406  0.8737999  0.09377238
##   0.425   0.1471172  0.8733518  0.09436542
##   0.450   0.1477083  0.8728943  0.09495540
##   0.475   0.1483059  0.8724337  0.09554946
##   0.500   0.1489224  0.8719555  0.09615197
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.175.
ggplot(ridge_fit) 

Another variant: LASSO regression

While Ridge applies a constraint on the sum of squares of coefficients, LASSO does the same for the sum of the absolute values of coefficients.

This seemingly small difference has important consequences: some coefficients are set exactly to zero, others are only shrunk towards zero.

# Fit the LASSO coefficient path (no lambda given: glmnet builds its own sequence)
set.seed(1234)
lasso_model <- glmnet(
  x = x_train, y = data_train[["log_sale_price"]], 
  family = "gaussian",
  alpha = 1  # the lasso model; if 0 then ridge
)

plot(lasso_model, xvar = "lambda")

Again, let’s look at individual coefficients. We can see that some are set exactly to zero for higher values of the penalty term. This is in contrast to what we saw with the Ridge model.

# extract the LASSO coefficient path for plotting below
lasso_coeffs <- get_glmnet_coeff_sequence(lasso_model)
## Warning: 'tidy.dgCMatrix' is deprecated.
## See help("Deprecated")
## Warning: 'tidy.dgTMatrix' is deprecated.
## See help("Deprecated")
# Trace the LASSO path for a handful of features; unlike ridge,
# some coefficients are set exactly to zero at higher penalties.
selected_variables <- c("gr_liv_area", "tot_rms_abv_grd", "garage_area", "kitchen_abv_gr")
lasso_coeffs %>%
  filter(variable %in% selected_variables) %>%
  ggplot(aes(x = log(lambda), y = value)) +
  geom_line() +
  facet_wrap(~ variable, scales = "free_y", ncol = 1)

Again, we can apply cross-validation to determine the optimal value for the penalty term.

# 10-fold cross-validation to select the LASSO penalty
set.seed(1234)
lasso_model_cv <- cv.glmnet(
  x = x_train, y = data_train[["log_sale_price"]], 
  family = "gaussian",
  alpha = 1,
  nfolds = 10
)

best_lambda <- lasso_model_cv$lambda.min
message(paste0("The optimally chosen penalty parameter: ", best_lambda))
## The optimally chosen penalty parameter: 0.00388076456762921
# lambda.1se: the simplest model whose CV error is within one SE of the minimum
highest_good_enough_lambda <- lasso_model_cv$lambda.1se
message(paste0("The highest good enough penalty parameter: ", highest_good_enough_lambda))
## The highest good enough penalty parameter: 0.0227297188251623
plot(lasso_model_cv)

Fitting LASSO models with caret is similar to that of Ridge.

# candidate penalties spanning several orders of magnitude, plus their midpoints
tenpowers <- 10^seq(-1, -5, by = -1)

lasso_tune_grid <- expand.grid(
  "alpha" = c(1), # alpha = 1 fixes LASSO
  "lambda" = c(tenpowers, tenpowers / 2) 
)

set.seed(857)
lasso_fit <- train(
  log_sale_price ~ . -sale_price,
  data = data_train, 
  method = "glmnet",
  preProcess = c("center", "scale"),
  tuneGrid = lasso_tune_grid,
  trControl = fit_control
)
lasso_fit
## glmnet 
## 
## 2053 samples
##   81 predictor
## 
## Pre-processing: centered (308), scaled (308) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1847, 1849, 1849, 1847, 1848, 1847, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE       Rsquared   MAE       
##   5e-06   0.1724948  0.8316208  0.08760109
##   1e-05   0.1724948  0.8316208  0.08760109
##   5e-05   0.1719195  0.8325130  0.08753221
##   1e-04   0.1701065  0.8353641  0.08724015
##   5e-04   0.1608812  0.8493622  0.08610976
##   1e-03   0.1549066  0.8580135  0.08566133
##   5e-03   0.1481077  0.8664453  0.08711685
##   1e-02   0.1534121  0.8580125  0.09303712
##   5e-02   0.1970917  0.8002051  0.13039984
##   1e-01   0.2394941  0.7692260  0.16621033
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.005.
ggplot(lasso_fit) + scale_x_log10()

Variable selection - why is it happening?

Source: Introduction to Statistical Learning. The constraint regions are different: LASSO’s has corners while Ridge’s is smooth. Hitting a corner means some coefficients are exactly zero.

Source: Introduction to Statistical Learning

Combine Ridge and LASSO: Elastic net

We can combine both types of penalties. LASSO is attractive since it performs principled variable selection. However, when having correlated features, typically only one of them - quite arbitrarily - is kept in the model. Ridge simultaneously shrinks coefficients of these towards zero. If we apply penalties of both the absolute values and the squares of the coefficients, both virtues are retained. This method is called Elastic net.

# elastic net: tune alpha (the ridge-LASSO mix) jointly with lambda,
# reusing the candidate penalties from both earlier grids
enet_tune_grid <- expand.grid(
  "alpha" = seq(0, 1, by = 0.1),
  "lambda" = union(lasso_tune_grid[["lambda"]], ridge_tune_grid[["lambda"]])
)

set.seed(857)
enet_fit <- train(
  log_sale_price ~ . -sale_price,
  data = data_train,
  method = "glmnet",
  preProcess = c("center", "scale"),
  tuneGrid = enet_tune_grid,
  trControl = fit_control
)
enet_fit
## glmnet 
## 
## 2053 samples
##   81 predictor
## 
## Pre-processing: centered (308), scaled (308) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1847, 1849, 1849, 1847, 1848, 1847, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda    RMSE       Rsquared   MAE       
##   0.0    0.000005  0.1494346  0.8665309  0.08716389
##   0.0    0.000010  0.1494346  0.8665309  0.08716389
##   0.0    0.000050  0.1494346  0.8665309  0.08716389
##   0.0    0.000100  0.1494346  0.8665309  0.08716389
##   0.0    0.000500  0.1494346  0.8665309  0.08716389
##   0.0    0.001000  0.1494346  0.8665309  0.08716389
##   0.0    0.005000  0.1494346  0.8665309  0.08716389
##   0.0    0.010000  0.1494346  0.8665309  0.08716389
##   0.0    0.050000  0.1464742  0.8707003  0.08654948
##   0.0    0.075000  0.1446715  0.8732402  0.08641530
##   0.0    0.100000  0.1436810  0.8746908  0.08668066
##   0.0    0.125000  0.1431577  0.8755367  0.08716370
##   0.0    0.150000  0.1429352  0.8760033  0.08772921
##   0.0    0.175000  0.1429234  0.8762089  0.08830684
##   0.0    0.200000  0.1430586  0.8762391  0.08891297
##   0.0    0.225000  0.1433025  0.8761429  0.08951049
##   0.0    0.250000  0.1436295  0.8759524  0.09012103
##   0.0    0.275000  0.1440211  0.8756904  0.09074102
##   0.0    0.300000  0.1444622  0.8753753  0.09135751
##   0.0    0.325000  0.1449408  0.8750214  0.09196578
##   0.0    0.350000  0.1454497  0.8746372  0.09257827
##   0.0    0.375000  0.1459867  0.8742271  0.09318557
##   0.0    0.400000  0.1465406  0.8737999  0.09377238
##   0.0    0.425000  0.1471172  0.8733518  0.09436542
##   0.0    0.450000  0.1477083  0.8728943  0.09495540
##   0.0    0.475000  0.1483059  0.8724337  0.09554946
##   0.0    0.500000  0.1489224  0.8719555  0.09615197
##   0.1    0.000005  0.1670353  0.8403192  0.08758031
##   0.1    0.000010  0.1670353  0.8403192  0.08758031
##   0.1    0.000050  0.1670353  0.8403192  0.08758031
##   0.1    0.000100  0.1670353  0.8403192  0.08758031
##   0.1    0.000500  0.1652817  0.8430854  0.08759484
##   0.1    0.001000  0.1626288  0.8471310  0.08753583
##   0.1    0.005000  0.1537384  0.8599886  0.08648934
##   0.1    0.010000  0.1497553  0.8655036  0.08577203
##   0.1    0.050000  0.1449925  0.8724173  0.08752321
##   0.1    0.075000  0.1472820  0.8700995  0.09078731
##   0.1    0.100000  0.1510769  0.8657510  0.09442049
##   0.1    0.125000  0.1552411  0.8610822  0.09824876
##   0.1    0.150000  0.1591302  0.8570805  0.10182023
##   0.1    0.175000  0.1628752  0.8535522  0.10509368
##   0.1    0.200000  0.1667296  0.8500137  0.10831016
##   0.1    0.225000  0.1705956  0.8465929  0.11144985
##   0.1    0.250000  0.1744036  0.8434133  0.11449097
##   0.1    0.275000  0.1782365  0.8402647  0.11748185
##   0.1    0.300000  0.1821332  0.8370731  0.12043730
##   0.1    0.325000  0.1859880  0.8340924  0.12340008
##   0.1    0.350000  0.1898015  0.8312907  0.12637167
##   0.1    0.375000  0.1935818  0.8286476  0.12930385
##   0.1    0.400000  0.1973826  0.8260119  0.13228109
##   0.1    0.425000  0.2011854  0.8233597  0.13526745
##   0.1    0.450000  0.2049773  0.8207158  0.13827386
##   0.1    0.475000  0.2087777  0.8180422  0.14127767
##   0.1    0.500000  0.2124801  0.8156490  0.14421317
##   0.2    0.000005  0.1686982  0.8376346  0.08746792
##   0.2    0.000010  0.1686982  0.8376346  0.08746792
##   0.2    0.000050  0.1686982  0.8376346  0.08746792
##   0.2    0.000100  0.1686982  0.8376346  0.08746792
##   0.2    0.000500  0.1645699  0.8442078  0.08732577
##   0.2    0.001000  0.1612194  0.8492273  0.08703339
##   0.2    0.005000  0.1516590  0.8626889  0.08593693
##   0.2    0.010000  0.1468238  0.8694456  0.08539376
##   0.2    0.050000  0.1514324  0.8632664  0.09349835
##   0.2    0.075000  0.1582912  0.8549462  0.10037994
##   0.2    0.100000  0.1647081  0.8479959  0.10625892
##   0.2    0.125000  0.1712318  0.8413826  0.11168947
##   0.2    0.150000  0.1780718  0.8344974  0.11702932
##   0.2    0.175000  0.1847363  0.8282441  0.12218695
##   0.2    0.200000  0.1912079  0.8226110  0.12726045
##   0.2    0.225000  0.1977384  0.8169232  0.13226425
##   0.2    0.250000  0.2041532  0.8116526  0.13724719
##   0.2    0.275000  0.2103213  0.8073177  0.14212879
##   0.2    0.300000  0.2164762  0.8031795  0.14704757
##   0.2    0.325000  0.2226729  0.7989754  0.15202946
##   0.2    0.350000  0.2289062  0.7946446  0.15706538
##   0.2    0.375000  0.2349507  0.7908896  0.16208311
##   0.2    0.400000  0.2406921  0.7880980  0.16690543
##   0.2    0.425000  0.2463364  0.7857864  0.17172436
##   0.2    0.450000  0.2519374  0.7837543  0.17655693
##   0.2    0.475000  0.2575106  0.7818457  0.18143574
##   0.2    0.500000  0.2631246  0.7798631  0.18640559
##   0.3    0.000005  0.1694889  0.8363694  0.08742121
##   0.3    0.000010  0.1694889  0.8363694  0.08742121
##   0.3    0.000050  0.1694889  0.8363694  0.08742121
##   0.3    0.000100  0.1693230  0.8366335  0.08741713
##   0.3    0.000500  0.1640506  0.8449649  0.08710775
##   0.3    0.001000  0.1601442  0.8507472  0.08670681
##   0.3    0.005000  0.1497165  0.8652159  0.08569673
##   0.3    0.010000  0.1461510  0.8701295  0.08560862
##   0.3    0.050000  0.1582715  0.8536112  0.09981780
##   0.3    0.075000  0.1671751  0.8435546  0.10803163
##   0.3    0.100000  0.1766422  0.8333271  0.11561899
##   0.3    0.125000  0.1857002  0.8244686  0.12269191
##   0.3    0.150000  0.1950144  0.8152437  0.12978800
##   0.3    0.175000  0.2039044  0.8073794  0.13669108
##   0.3    0.200000  0.2125468  0.8005866  0.14354836
##   0.3    0.225000  0.2210106  0.7943273  0.15026194
##   0.3    0.250000  0.2289228  0.7900343  0.15675433
##   0.3    0.275000  0.2367623  0.7866223  0.16347045
##   0.3    0.300000  0.2446801  0.7833307  0.17036884
##   0.3    0.325000  0.2525252  0.7806005  0.17735528
##   0.3    0.350000  0.2603597  0.7782232  0.18438724
##   0.3    0.375000  0.2682727  0.7757639  0.19159587
##   0.3    0.400000  0.2762823  0.7729326  0.19892319
##   0.3    0.425000  0.2842662  0.7698784  0.20626618
##   0.3    0.450000  0.2922789  0.7664228  0.21360385
##   0.3    0.475000  0.3001680  0.7629534  0.22076395
##   0.3    0.500000  0.3078466  0.7601795  0.22767306
##   0.4    0.000005  0.1704846  0.8348458  0.08741585
##   0.4    0.000010  0.1704846  0.8348458  0.08741585
##   0.4    0.000050  0.1704846  0.8348458  0.08741585
##   0.4    0.000100  0.1696535  0.8361684  0.08736964
##   0.4    0.000500  0.1636200  0.8455879  0.08689064
##   0.4    0.001000  0.1595481  0.8515146  0.08646928
##   0.4    0.005000  0.1479330  0.8676950  0.08557608
##   0.4    0.010000  0.1467300  0.8689270  0.08627611
##   0.4    0.050000  0.1637869  0.8463652  0.10479141
##   0.4    0.075000  0.1757953  0.8326936  0.11464710
##   0.4    0.100000  0.1876609  0.8200835  0.12378325
##   0.4    0.125000  0.1993287  0.8083322  0.13276478
##   0.4    0.150000  0.2103536  0.7987837  0.14146575
##   0.4    0.175000  0.2205396  0.7919677  0.14968744
##   0.4    0.200000  0.2304581  0.7871512  0.15808533
##   0.4    0.225000  0.2405416  0.7826474  0.16696311
##   0.4    0.250000  0.2505703  0.7791898  0.17594585
##   0.4    0.275000  0.2608273  0.7755577  0.18529023
##   0.4    0.300000  0.2711586  0.7717192  0.19474537
##   0.4    0.325000  0.2815382  0.7674957  0.20427500
##   0.4    0.350000  0.2917695  0.7639305  0.21357868
##   0.4    0.375000  0.3019084  0.7607288  0.22272180
##   0.4    0.400000  0.3119532  0.7578097  0.23163290
##   0.4    0.425000  0.3219891  0.7534710  0.24046073
##   0.4    0.450000  0.3320417  0.7471823  0.24926727
##   0.4    0.475000  0.3420369  0.7378820  0.25797869
##   0.4    0.500000  0.3517341  0.7247923  0.26634789
##   0.5    0.000005  0.1708418  0.8342819  0.08737420
##   0.5    0.000010  0.1708418  0.8342819  0.08737420
##   0.5    0.000050  0.1708418  0.8342819  0.08737420
##   0.5    0.000100  0.1695925  0.8362622  0.08731825
##   0.5    0.000500  0.1632201  0.8461364  0.08679384
##   0.5    0.001000  0.1591060  0.8520758  0.08629427
##   0.5    0.005000  0.1470707  0.8688008  0.08550668
##   0.5    0.010000  0.1474686  0.8675998  0.08710968
##   0.5    0.050000  0.1695629  0.8386063  0.10935270
##   0.5    0.075000  0.1841805  0.8219876  0.12081526
##   0.5    0.100000  0.1985376  0.8064791  0.13187596
##   0.5    0.125000  0.2114672  0.7953057  0.14216486
##   0.5    0.150000  0.2233297  0.7884573  0.15205946
##   0.5    0.175000  0.2353497  0.7827136  0.16257297
##   0.5    0.200000  0.2473769  0.7784926  0.17331738
##   0.5    0.225000  0.2597029  0.7740997  0.18454370
##   0.5    0.250000  0.2723377  0.7689306  0.19602060
##   0.5    0.275000  0.2849918  0.7639630  0.20749237
##   0.5    0.300000  0.2975115  0.7602207  0.21871114
##   0.5    0.325000  0.3101928  0.7549867  0.23001788
##   0.5    0.350000  0.3229323  0.7472781  0.24129766
##   0.5    0.375000  0.3355561  0.7357114  0.25234518
##   0.5    0.400000  0.3480588  0.7176339  0.26315506
##   0.5    0.425000  0.3603446  0.6884627  0.27365906
##   0.5    0.450000  0.3715257  0.6576301  0.28316301
##   0.5    0.475000  0.3816336  0.6214163  0.29169979
##   0.5    0.500000  0.3900225  0.6099986  0.29872411
##   0.6    0.000005  0.1707480  0.8344097  0.08737420
##   0.6    0.000010  0.1707480  0.8344097  0.08737420
##   0.6    0.000050  0.1707511  0.8344121  0.08737744
##   0.6    0.000100  0.1696843  0.8361159  0.08730022
##   0.6    0.000500  0.1627459  0.8468054  0.08669446
##   0.6    0.001000  0.1581294  0.8534760  0.08603425
##   0.6    0.005000  0.1468823  0.8689147  0.08563244
##   0.6    0.010000  0.1482474  0.8663156  0.08812014
##   0.6    0.050000  0.1753692  0.8307643  0.11395467
##   0.6    0.075000  0.1926331  0.8105966  0.12714755
##   0.6    0.100000  0.2079225  0.7961092  0.13922013
##   0.6    0.125000  0.2217196  0.7873250  0.15063365
##   0.6    0.150000  0.2355530  0.7806855  0.16271013
##   0.6    0.175000  0.2496284  0.7755672  0.17528103
##   0.6    0.200000  0.2644768  0.7687477  0.18883049
##   0.6    0.225000  0.2792485  0.7634871  0.20218177
##   0.6    0.250000  0.2941244  0.7586748  0.21561758
##   0.6    0.275000  0.3093384  0.7510517  0.22924629
##   0.6    0.300000  0.3247103  0.7390044  0.24280429
##   0.6    0.325000  0.3401681  0.7184073  0.25623974
##   0.6    0.350000  0.3552638  0.6859485  0.26917181
##   0.6    0.375000  0.3689691  0.6506777  0.28091486
##   0.6    0.400000  0.3811593  0.6164726  0.29128245
##   0.6    0.425000  0.3918545  0.5960832  0.30026817
##   0.6    0.450000  0.4008493  0.5904565  0.30777879
##   0.6    0.475000  0.4067417        NaN  0.31268761
##   0.6    0.500000  0.4067417        NaN  0.31268761
##   0.7    0.000005  0.1716968  0.8329255  0.08750992
##   0.7    0.000010  0.1716968  0.8329255  0.08750992
##   0.7    0.000050  0.1715011  0.8332347  0.08750053
##   0.7    0.000100  0.1699035  0.8357798  0.08731974
##   0.7    0.000500  0.1623243  0.8473529  0.08657680
##   0.7    0.001000  0.1571805  0.8548542  0.08587782
##   0.7    0.005000  0.1470432  0.8684973  0.08590511
##   0.7    0.010000  0.1492584  0.8646990  0.08925266
##   0.7    0.050000  0.1809722  0.8231889  0.11816131
##   0.7    0.075000  0.2000806  0.8012626  0.13291098
##   0.7    0.100000  0.2161021  0.7885254  0.14579813
##   0.7    0.125000  0.2316678  0.7801380  0.15923322
##   0.7    0.150000  0.2476494  0.7737710  0.17346477
##   0.7    0.175000  0.2646161  0.7655558  0.18895086
##   0.7    0.200000  0.2814634  0.7603362  0.20414602
##   0.7    0.225000  0.2989461  0.7524448  0.21986790
##   0.7    0.250000  0.3169027  0.7391561  0.23576701
##   0.7    0.275000  0.3351881  0.7144350  0.25174302
##   0.7    0.300000  0.3529382  0.6761303  0.26699997
##   0.7    0.325000  0.3689497  0.6366692  0.28088389
##   0.7    0.350000  0.3830308  0.6089050  0.29287926
##   0.7    0.375000  0.3955795  0.5838639  0.30338405
##   0.7    0.400000  0.4059476  0.5397024  0.31203457
##   0.7    0.425000  0.4067417        NaN  0.31268761
##   0.7    0.450000  0.4067417        NaN  0.31268761
##   0.7    0.475000  0.4067417        NaN  0.31268761
##   0.7    0.500000  0.4067417        NaN  0.31268761
##   0.8    0.000005  0.1721038  0.8322803  0.08758002
##   0.8    0.000010  0.1721038  0.8322803  0.08758002
##   0.8    0.000050  0.1717511  0.8328301  0.08754194
##   0.8    0.000100  0.1700496  0.8355322  0.08730294
##   0.8    0.000500  0.1618481  0.8480116  0.08641227
##   0.8    0.001000  0.1563228  0.8560612  0.08577434
##   0.8    0.005000  0.1473617  0.8678215  0.08624707
##   0.8    0.010000  0.1505587  0.8626032  0.09045754
##   0.8    0.050000  0.1866407  0.8151235  0.12233824
##   0.8    0.075000  0.2066492  0.7935612  0.13805376
##   0.8    0.100000  0.2240920  0.7813816  0.15255193
##   0.8    0.125000  0.2415457  0.7736673  0.16796514
##   0.8    0.150000  0.2603165  0.7643673  0.18497090
##   0.8    0.175000  0.2792342  0.7579511  0.20200595
##   0.8    0.200000  0.2991234  0.7474932  0.21986302
##   0.8    0.225000  0.3198053  0.7275671  0.23816684
##   0.8    0.250000  0.3405778  0.6927868  0.25626064
##   0.8    0.275000  0.3600881  0.6489638  0.27321277
##   0.8    0.300000  0.3775321  0.6108297  0.28825338
##   0.8    0.325000  0.3931478  0.5765837  0.30137630
##   0.8    0.350000  0.4058937  0.5300012  0.31199095
##   0.8    0.375000  0.4067417        NaN  0.31268761
##   0.8    0.400000  0.4067417        NaN  0.31268761
##   0.8    0.425000  0.4067417        NaN  0.31268761
##   0.8    0.450000  0.4067417        NaN  0.31268761
##   0.8    0.475000  0.4067417        NaN  0.31268761
##   0.8    0.500000  0.4067417        NaN  0.31268761
##   0.9    0.000005  0.1721561  0.8321641  0.08755384
##   0.9    0.000010  0.1721561  0.8321641  0.08755384
##   0.9    0.000050  0.1717024  0.8328740  0.08751928
##   0.9    0.000100  0.1700485  0.8354975  0.08726181
##   0.9    0.000500  0.1613353  0.8487311  0.08624674
##   0.9    0.001000  0.1555381  0.8571492  0.08568712
##   0.9    0.005000  0.1477188  0.8671369  0.08664239
##   0.9    0.010000  0.1519674  0.8603235  0.09171477
##   0.9    0.050000  0.1920288  0.8073659  0.12643592
##   0.9    0.075000  0.2128232  0.7867296  0.14303505
##   0.9    0.100000  0.2316538  0.7757767  0.15919885
##   0.9    0.125000  0.2518989  0.7648363  0.17729858
##   0.9    0.150000  0.2724298  0.7579420  0.19582998
##   0.9    0.175000  0.2945061  0.7459709  0.21563183
##   0.9    0.200000  0.3178321  0.7213501  0.23631602
##   0.9    0.225000  0.3411620  0.6810363  0.25670865
##   0.9    0.250000  0.3628911  0.6335234  0.27571669
##   0.9    0.275000  0.3823698  0.5968865  0.29237031
##   0.9    0.300000  0.3995478  0.5686065  0.30670381
##   0.9    0.325000  0.4067417        NaN  0.31268761
##   0.9    0.350000  0.4067417        NaN  0.31268761
##   0.9    0.375000  0.4067417        NaN  0.31268761
##   0.9    0.400000  0.4067417        NaN  0.31268761
##   0.9    0.425000  0.4067417        NaN  0.31268761
##   0.9    0.450000  0.4067417        NaN  0.31268761
##   0.9    0.475000  0.4067417        NaN  0.31268761
##   0.9    0.500000  0.4067417        NaN  0.31268761
##   1.0    0.000005  0.1724948  0.8316208  0.08760109
##   1.0    0.000010  0.1724948  0.8316208  0.08760109
##   1.0    0.000050  0.1719195  0.8325130  0.08753221
##   1.0    0.000100  0.1701065  0.8353641  0.08724015
##   1.0    0.000500  0.1608812  0.8493622  0.08610976
##   1.0    0.001000  0.1549066  0.8580135  0.08566133
##   1.0    0.005000  0.1481077  0.8664453  0.08711685
##   1.0    0.010000  0.1534121  0.8580125  0.09303712
##   1.0    0.050000  0.1970917  0.8002051  0.13039984
##   1.0    0.075000  0.2187342  0.7803134  0.14795038
##   1.0    0.100000  0.2394941  0.7692260  0.16621033
##   1.0    0.125000  0.2615068  0.7600852  0.18595485
##   1.0    0.150000  0.2852745  0.7481946  0.20723154
##   1.0    0.175000  0.3109955  0.7219131  0.23014813
##   1.0    0.200000  0.3369360  0.6800482  0.25293518
##   1.0    0.225000  0.3613613  0.6292399  0.27442996
##   1.0    0.250000  0.3835125  0.5855161  0.29335653
##   1.0    0.275000  0.4029942  0.5599177  0.30958250
##   1.0    0.300000  0.4067417        NaN  0.31268761
##   1.0    0.325000  0.4067417        NaN  0.31268761
##   1.0    0.350000  0.4067417        NaN  0.31268761
##   1.0    0.375000  0.4067417        NaN  0.31268761
##   1.0    0.400000  0.4067417        NaN  0.31268761
##   1.0    0.425000  0.4067417        NaN  0.31268761
##   1.0    0.450000  0.4067417        NaN  0.31268761
##   1.0    0.475000  0.4067417        NaN  0.31268761
##   1.0    0.500000  0.4067417        NaN  0.31268761
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.175.
# Plot cross-validated performance over the (alpha, lambda) tuning grid;
# the lambda values span several orders of magnitude, so a log10 x-axis
# spreads them out readably.
ggplot(enet_fit) + scale_x_log10()
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 11. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 135 rows containing missing values (geom_point).

Evaluate all models

For completeness, also estimate the non-regularized (plain least squares) linear model as a baseline.

# Baseline: an unpenalized linear regression on the same predictors, so the
# penalized models have a reference point. The fixed seed and the shared
# fit_control object are the same ones used for the regularized fits.
set.seed(857)
linear_fit <- train(
  log_sale_price ~ . - sale_price,  # model the log price; drop the raw target
  data = data_train,
  method = "lm",
  trControl = fit_control,
  preProcess = c("center", "scale")
)
# Collect the resampling results of all four fitted models into a single
# object so their performance can be compared on the same CV folds.
resample_profile <- resamples(
  list("linear" = linear_fit,
       "ridge" = ridge_fit,
       "lasso" = lasso_fit,
       "elastic net" = enet_fit
  )
) 

# Per-model distribution (min / quartiles / mean / max) of MAE, RMSE and
# R-squared across the 10 resamples.
summary(resample_profile)
## 
## Call:
## summary.resamples(object = resample_profile)
## 
## Models: linear, ridge, lasso, elastic net 
## Number of resamples: 10 
## 
## MAE 
##                   Min.    1st Qu.     Median       Mean    3rd Qu.      Max.
## linear      0.07777197 0.07940461 0.09265443 0.08873995 0.09527491 0.0987329
## ridge       0.07367948 0.08026915 0.08802844 0.08830684 0.09413163 0.1061065
## lasso       0.07086193 0.08087593 0.08594562 0.08711685 0.09203586 0.1047683
## elastic net 0.07367948 0.08026915 0.08802844 0.08830684 0.09413163 0.1061065
##             NA's
## linear         0
## ridge          0
## lasso          0
## elastic net    0
## 
## RMSE 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## linear      0.11238141 0.1364270 0.1721632 0.1771764 0.2204885 0.2404686    0
## ridge       0.09860469 0.1175353 0.1319301 0.1429234 0.1472833 0.2196785    0
## lasso       0.09536545 0.1233744 0.1333620 0.1481077 0.1520182 0.2372004    0
## elastic net 0.09860469 0.1175353 0.1319301 0.1429234 0.1472833 0.2196785    0
## 
## Rsquared 
##                  Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## linear      0.7147800 0.7634893 0.8286702 0.8249801 0.8875117 0.9207508    0
## ridge       0.7488259 0.8582377 0.8939727 0.8762089 0.9185868 0.9253607    0
## lasso       0.7119918 0.8498486 0.8874229 0.8664453 0.9117261 0.9298672    0
## elastic net 0.7488259 0.8582377 0.8939727 0.8762089 0.9185868 0.9253607    0
# Box-and-whisker plots of the resampled performance metrics, one box per model.
bwplot(resample_profile)

Are the differences between models large in a statistical sense? Not really: all of the Bonferroni-adjusted p-values above are far from conventional significance thresholds. What we can certainly see is that the plain linear model’s performance varies much more across resamples than that of the penalized ones.

# Pairwise differences in resampled performance between models; the summary
# reports the estimated differences (upper triangle) and Bonferroni-adjusted
# p-values for H0: difference = 0 (lower triangle).
model_differences <- diff(resample_profile)
summary(model_differences)
## 
## Call:
## summary.diff.resamples(object = model_differences)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## MAE 
##             linear ridge      lasso      elastic net
## linear              0.0004331  0.0016231  0.0004331 
## ridge       1                  0.0011900  0.0000000 
## lasso       1      1                     -0.0011900 
## elastic net 1      NA         1                     
## 
## RMSE 
##             linear  ridge     lasso     elastic net
## linear               0.034253  0.029069  0.034253  
## ridge       0.09586           -0.005184  0.000000  
## lasso       0.24831 0.49739              0.005184  
## elastic net 0.09586 NA        0.49739              
## 
## Rsquared 
##             linear ridge     lasso     elastic net
## linear             -0.051229 -0.041465 -0.051229  
## ridge       0.1170            0.009764  0.000000  
## lasso       0.3528 0.3374              -0.009764  
## elastic net 0.1170 NA        0.3374
# Visualize the pairwise model differences estimated above.
dotplot(model_differences)

Evaluate the chosen model on the holdout set

# Final evaluation: RMSE of the chosen elastic net model on the held-out test
# set, on the log sale price scale.
RMSE(predict(enet_fit, newdata = data_test), data_test[["log_sale_price"]])
## [1] 0.1268464